library(tidyverse)library(ggplot2)library(dplyr)library(leaflet)library(usmap)library(sf)library(readr)library(scales)library(tidyr)Arrest2023 <-read_csv("2023_arrest_data.csv", col_types =cols(ArresteeNumber =col_skip(), ArrestDate =col_date(format ="%m/%d/%Y"), ArrestTime =col_character(), WEB_ADDRESS =col_skip(), PHONE_NUMBER =col_skip(), NAME =col_skip()))## Remove any outliers by setting min/max lat and long.min_lat =38.6max_lat =39.0min_lon =-77.6max_lon =-77.0## Using all data but excluding any coordinates that are not within coordinatesarrest_data_clean = Arrest2023 %>%filter( Latitude >= min_lat, Latitude <= max_lat, Longitude >= min_lon, Longitude <= max_lon )## Include district boundaries by reading in shape file downloaded from Fairfax County sitedistrict_boundaries =st_read("Supervisor_Districts/Supervisor_Districts.shp")
Reading layer `Supervisor_Districts' from data source
`C:\Users\Nat\OneDrive - George Mason University - O365 Production\Git\nathaniams.github.io\Supervisor_Districts\Supervisor_Districts.shp'
using driver `ESRI Shapefile'
Simple feature collection with 9 features and 12 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: 11757190 ymin: 6905421 xmax: 11899000 ymax: 7070364
Projected CRS: NAD83 / Virginia North (ftUS)
## Transform to WGS84 projection to match basemap within leafletif (st_crs(district_boundaries)$epsg !=4326) { district_boundaries =st_transform(district_boundaries, 4326)}
## Using leaflet to map the Arrest data and include district layersleaflet() %>%addProviderTiles(providers$OpenStreetMap) %>%addPolygons(data = district_boundaries,fillOpacity =0.5,color ="black",weight =1,popup =~paste("District:", DISTRICT) ) %>%addCircleMarkers(data = arrest_data_clean,lng =~Longitude,lat =~Latitude,radius =3,color ="darkred",stroke =FALSE,fillOpacity =0.4,popup =~paste("Case Number: ", CaseNumber, "<br>","Arrest Type: ", IBRDescription) )
# By IBRCodearrest_count = Arrest2023 %>%group_by(IBRCode) %>%summarise(Count =n()) %>%ungroup() %>%arrange(desc(Count))# Get top 10top_10_arrest_IBR =head(arrest_count, 10)# Add codes - General DescriptionIBRCodeDetails =read.csv("IBRcodes.csv")# Reassign the codestop_10_arrest_decoded = top_10_arrest_IBR %>%left_join(IBRCodeDetails, by =c("IBRCode"="CODE")) %>%select(IBRCode, OFFENSE, everything())top_10_arrest_cleaned = top_10_arrest_decoded %>%distinct(IBRCode, OFFENSE, .keep_all =TRUE)# Decode for cleaner resultsarrest_chart =ggplot(top_10_arrest_cleaned, aes(x =reorder(OFFENSE, Count),y = Count)) +geom_bar(stat ="identity", fill ="skyblue") +coord_flip() +labs(title ="Top 10 Arrest Type by IBR decoded",x ="Offense by IBR Code",y ="Number of Arrests") +theme_grey() +geom_text(aes(label = Count), hjust =-0.1, size =3.5) +scale_y_continuous(expand =expansion(mult =c(0, 0.15)))arrest_chart
library(readr)library(lubridate)warnings =read_csv("2023_warning_data.csv", col_types =cols(Warnings_Date =col_date(format ="%m/%d/%Y"), WEB_ADDRESS =col_skip(), PHONE_NUMBER =col_skip(), NAME =col_skip()))citations =read_csv("2023_citation_data.csv", col_types =cols(Date =col_date(format ="%m/%d/%Y"), WEB_ADDRESS =col_skip(), PHONE_NUMBER =col_skip(), NAME =col_skip()))# Rename some columnscitations = citations %>%rename(ViolationDate = Date)# change Gender to sex in warnings and change date column namewarnings = warnings %>%rename(Sex = Gender)warnings = warnings %>%rename(ViolationDate = Warnings_Date)# Adjust Citations and prepare for Merge# Assumption that ID is the officer's IDcitations_processed = citations %>%mutate(outcome ="Citation",Gender =case_when( Sex =="M"~"Male", Sex =="F"~"Female",TRUE~"Other/Unknown" ),Year =year(ViolationDate),Month =month(ViolationDate),DayOfMonth =day(ViolationDate),Time =parse_date_time(Time, "HM"),data_type ="Citation" ) %>%select( outcome, Gender, Year, Month, DayOfMonth, Time, Offense_Description = Charge,District = DISTRICT, Race, Ethnicity, Latitude, Longitude, OfficerID = ID, data_type )# Adjust Warnings and prepare for Mergewarnings_processed = warnings %>%mutate(outcome ="Warning",Gender =case_when( Sex =="M"~"Male", Sex =="F"~"Female",TRUE~"Other/Unknown" ),Year =year(ViolationDate),Month =month(ViolationDate),DayOfMonth =day(ViolationDate),Time =parse_date_time(Time, "HM"),data_type ="Warning" ) %>%select( outcome, Gender, Year, Month, DayOfMonth, Time, Offense_Description, District = DISTRICT, Race, Ethnicity, Latitude = Lat, Longitude = Long, OfficerID = Officer_ID, data_type )# Combined for ultimate Data coordination!combined_wc =bind_rows(citations_processed, warnings_processed)# Add ultimate binary outcome! 0 = Citation, 1 = Warning/ Got out of ticketcombined_wc = combined_wc %>%mutate(BinaryOutcome =ifelse(outcome =="Warning", 1,0) )## Change to Title Case for District Namescombined_wc$District = tools::toTitleCase(tolower(combined_wc$District))## Examining Unverified data## After examination, unverified only makes up 0.0143 or 1.43% of the data set, so we will remove## because it is a very small portion of the total proportion. combined_wc %>%count(District) %>%mutate(Proportion = n /sum(n)) %>%arrange(desc(n))
# A tibble: 11 × 3
District n Proportion
<chr> <int> <dbl>
1 Sully 18612 0.208
2 Springfield 12581 0.140
3 Braddock 10292 0.115
4 Franconia 10033 0.112
5 Hunter Mill 8718 0.0972
6 Mason 8168 0.0911
7 Dranesville 7143 0.0797
8 Providence 6713 0.0749
9 Mount Vernon 6113 0.0682
10 Unverified 1281 0.0143
11 <NA> 1 0.0000112
## Filter out Unverified and NAcombined_wc = combined_wc %>%filter(District !="Unverified")combined_wc = combined_wc %>%filter(!is.na(District))## Filter out Other/Unknown Gendercombined_wc_mf = combined_wc %>%filter(Gender !="Other/Unknown")## Now for some visuals: Gender Chart## Examining the proportion of stops resulting in a Warning Vs Citation## the Warning rate is the proportion of incidents that are warnings.gender_warning_rate = combined_wc_mf %>%group_by(Gender) %>%summarise(Total_Incidents =n(),Warning_Rate =mean(BinaryOutcome) ) %>%ungroup()gender_chart =ggplot(gender_warning_rate,aes(x = Gender, y = Warning_Rate, fill = Gender)) +geom_col(show.legend =FALSE) +geom_text(aes(label = scales::percent(Warning_Rate, accuracy =0.1)),vjust =-0.5, size =5) +scale_y_continuous(labels = scales::percent, limits =c(0, max(gender_warning_rate$Warning_Rate) *1.1)) +labs(title ="Warning Rate by Gender",subtitle ="Proportion of stops resulting in a Warning (vs Citation)",x ="Gender",y ="Warning Rate" ) +theme_gray() +theme(plot.title =element_text(hjust =0.5)) +theme(plot.subtitle =element_text(hjust =0.5)) +scale_fill_manual(values =c("Female"="pink", "Male"="skyblue"))gender_chart
## Now the Chi-Squared Test starting with the Contingency Tablecontingency_tbl = combined_wc_mf %>%filter(Gender %in%c("Male", "Female")) %>%select(Gender, BinaryOutcome) %>%table()contingency_tbl
BinaryOutcome
Gender 0 1
Female 20478 8777
Male 43657 15408